library(haven) # 读取NHANES XPT文件的包
library(plyr) # 用于数据处理
library(dplyr) # 用于数据处理
library(arsenal) # 用于数据快速预览
library(survey) # 用于加权情况下的分析

setwd("D:\\NHANES DATA") # 设置工作目录
demo_g = read_xpt("2011-2012\\Demographics\\demo_g.xpt") # DEMO-人口学数据提取：2011-2012
demo_h = read_xpt("2013-2014\\Demographics\\demo_h.xpt") # DEMO-人口学数据提取：2013-2014
# dim(demo_g)
# dim(demo_h)
# colnames(demo_g) # 输出列名，即变量名称
# colnames(demo_h)
# table_1 = tableby( ~ RIDAGEYR + factor(RIAGENDR) + RIDRETH3 + DMDEDUC2 + INDFMPIR, data = demo_g)
# summary(table_1 , text = TRUE)
# 年龄-RIDAGEYR; 性别-RIAGENDR; 种族-RIDRETH3; 教育程度-DMDEDUC2; 贫困程度-INDFMPIR;
demo_data_file = dplyr::bind_rows(list(demo_g , demo_h)) # 将两个数据框中的数据进行合并，只是简单的累加，不作去重
# dim(demo_data_file)
demo_data = demo_data_file[ , c("SEQN" , "RIDAGEYR" , "RIAGENDR" , "RIDRETH3" , "DMDEDUC2" , "INDFMPIR")] # 只有其中的几个变量
# View(demo_data)
smq_g = read_xpt("2011-2012\\Questionnaire\\smq_g.xpt")
smq_h = read_xpt("2013-2014\\Questionnaire\\smq_h.xpt")
# dim(smq_g)
# dim(smq_h)
# 是否吸烟至少100支-SMQ020; 现在是否吸烟-SMQ040; 多久前开始戒烟-SMQ050Q;多久前开始戒烟的时间单位（天、周、月、年）-SMQ050U;
smp_data_file = dplyr::bind_rows(list(smq_g , smq_h))
# dim(smp_data_file)
smp_data = smp_data_file[ , c("SEQN" , "SMQ020" , "SMQ040" , "SMQ050Q" , "SMQ050U")] # 只有其中的几个变量
# dim(smp_data)
alq_g = read_xpt("2011-2012\\Questionnaire\\alq_g.xpt")
alq_h = read_xpt("2013-2014\\Questionnaire\\alq_h.xpt")
# 每年至少喝12杯酒-ALQ101;一生中至少喝过12次酒-ALQ110；过去12个月多久喝一次酒-ALQ120Q;过去12个月饮酒频率的单位（周，月，年）-ALQ120U;
alq_data_file = dplyr::bind_rows(list(alq_g , alq_h))
alq_data = alq_data_file[ , c("SEQN" , "ALQ101" , "ALQ110" , "ALQ120Q" , "ALQ120U")]
# dim(alq_data)
# BMI-BMXBMI; 腰围-BMXWAIST
bmx_g = read_xpt("2011-2012\\Examination\\bmx_g.xpt")
bmx_h = read_xpt("2013-2014\\Examination\\bmx_h.xpt")
bmx_data_file = dplyr::bind_rows(list(bmx_g , bmx_h))
bmx_data = bmx_data_file[ , c("SEQN" , "BMXBMI" , "BMXWAIST")]
# dim(bmx_data)
dr1tot_g = read_xpt("2011-2012\\Dietary\\dr1tot_g.xpt")
dr1tot_h = read_xpt("2013-2014\\Dietary\\dr1tot_h.xpt")
# DR1TLZ:叶黄素 + 玉米黄质 (mcg); DR1TKCAL:能量（千卡）
dr1tot_data_file = dplyr::bind_rows(list(dr1tot_g , dr1tot_h))
dr1tot_data = dr1tot_data_file[ , c("SEQN" , "DR1TLZ" , "DR1TKCAL")]
dr1tot_data_add_weight = dr1tot_data_file[ , c("SEQN" , "DR1TLZ" , "DR1TKCAL" , "WTDRD1" , "WTDR2D")] # 这个是第一天带权重的
# dim(dr1tot_data)
dr2tot_g = read_xpt("2011-2012\\Dietary\\dr2tot_g.xpt")
dr2tot_h = read_xpt("2013-2014\\Dietary\\dr2tot_h.xpt")
# DR2TLZ:叶黄素 + 玉米黄质 (mcg); DR2TKCAL:能量（千卡）
dr2tot_data_file = dplyr::bind_rows(list(dr2tot_g , dr2tot_h))
dr2tot_data = dr2tot_data_file[ , c("SEQN" , "DR2TLZ" , "DR2TKCAL")]
dr2tot_data_add_weight = dr2tot_data_file[ , c("SEQN" , "DR2TLZ" , "DR2TKCAL" , "WTDRD1" , "WTDR2D")] # 这个是第二天带权重的
dr_data = merge(dr1tot_data , dr2tot_data) # merge这个是拼接，默认使用两个数据框中相同列名称进行拼接。
dr_data_add_weight = merge(dr1tot_data_add_weight , dr2tot_data_add_weight)
# 使用merge进行拼接，dr1tot和dr2tot均有5个变量，拼接后变成了7个变量，这样对吗？实际验证了一下，dr1tot的WTDRD1与dr2tot的WTDRD1值是相同的，可以合并成一个。
# dim(dr_data_add_weight)
# View(dr_data_add_weight)
View(dr_data)
# dim(dr_data)
ds1tot_g = read_xpt("2011-2012\\Dietary\\ds1tot_g.xpt")
ds1tot_h = read_xpt("2013-2014\\Dietary\\ds1tot_h.xpt")
# DS1TLZ:叶黄素 + 玉米黄质 (mcg); DS2TLZ:叶黄素 + 玉米黄质 (mcg) 总膳食补充剂文件（DS1TOT_G 和 DS2TOT_G）中的变量:DS1TLZ	DS2TLZ	叶黄素 + 玉米黄质 (微克)
ds1tot_data_file = dplyr::bind_rows(list(ds1tot_g , ds1tot_h))
ds1tot_data = ds1tot_data_file[ , c("SEQN" , "DS1TLZ")]
# dim(ds1tot_data)
ds2tot_g = read_xpt("2011-2012\\Dietary\\ds2tot_g.xpt")
ds2tot_h = read_xpt("2013-2014\\Dietary\\ds2tot_h.xpt")
ds2tot_data_file = dplyr::bind_rows(list(ds2tot_g , ds2tot_h))
ds2tot_data = ds2tot_data_file[ , c("SEQN" , "DS2TLZ")]
ds_data = merge(ds1tot_data , ds2tot_data)
View(ds_data)
dim(ds_data)

cfq_g = read_xpt("2011-2012\\Questionnaire\\cfq_g.xpt")
cfq_h = read_xpt("2013-2014\\Questionnaire\\cfq_h.xpt")
cfq_data_file = dplyr::bind_rows(list(cfq_g , cfq_h))
# CFDCST1、CFDCST2、CFDCST3：词汇第1、2、3次即刻记忆测验；CFDCSR：延迟回忆评分；CFDAST：语言流畅性评分；CFDDS：数字符号替代测试
cfq_data = cfq_data_file[ , c("SEQN" , "CFDCST1" , "CFDCST2" , "CFDCST3" , "CFDCSR" , "CFDAST" , "CFDDS")]
# dim(cfq_data)
# write.csv(dr1tot_data_add_weight , "dr1tot_data_add_weight.csv")
# write.csv(dr2tot_data_add_weight , "dr2tot_data_add_weight.csv")
# View(dr_data_add_weight)
# dim(dr_data_add_weight)
weight_data = dr1tot_data_file[,c('SEQN', 'WTDRD1')]
weight_data$WTDRD1 = weight_data$WTDRD1/2 #权重的计算方式，需要按 Cycle 进行平均
# 上面的代码， 是不是需要改成下面的代码呀？因为求的是平均，那么就应该是WTDRD1加WTDR2D再除以2，现没有人回答。
# weight_data = dr1tot_data_file[,c('SEQN', 'WTDRD1', 'WTDR2D')]
# weight_data$WTDRD1 = (weight_data$WTDRD1 + weight_data$WTDR2D) / 2 #权重的计算方式，需要按 Cycle 进行平均

# SDMVPSU：Masked Variance Unit 用于方差估计的伪 PSU 变量；SDMVSTRA：用于方差估计的掩蔽方差单元伪层变量
survey_design_data = demo_data_file[ , c('SEQN', 'SDMVPSU', 'SDMVSTRA')]
# View(survey_design_data)

#demo_data, smp_data, alq_data, bmx_data, dr_data, ds_data, cfq_data, weight_data
output <- plyr::join_all(list(demo_data, smp_data, alq_data, bmx_data, dr_data, ds_data, cfq_data, weight_data, survey_design_data), by="SEQN", type="left")
# dim(output) # [1] 19931    31
View(output)

data_age_60 = subset.data.frame(output, RIDAGEYR >= 60 )
dim(data_age_60) # [1] 3632   31    ???
weight_non_na_index = which(!is.na(data_age_60$WTDRD1)) # 这是一个向量的集合，就是变量WTDRD1不为NA的集合，比如，第1、第2、第4不是NA，则值为c(1,2,4)
# View(data_age_60)
sum(data_age_60$WTDRD1[weight_non_na_index]) # 如果weight_non_na_index为c(1,2,4)，则可以写作：sum(data_age_60$WTDRD1[c(1,2,4)])
# dim(sum)

paper_data = subset.data.frame(output, RIDAGEYR >= 60 &
                                 (!is.na(CFDCSR)) & # CERAD 延迟回忆评分
                                 (!is.na(CFDAST)) & # 语言流畅性评分Animal Fluency
                                 (!is.na(CFDDS)) & # 数字符号替代测试(DSST)
                                 (!is.na(DR1TLZ))& # 饮食-叶黄素 & 玉米黄质摄取
                                 (!is.na(DR1TKCAL))& # 能量摄入总量-Energy (kcal)
                                 (!is.na(RIDAGEYR)) & # 年龄
                                 (!is.na(RIAGENDR)) & # 性别
                                 (!is.na(RIDRETH3)) & # 种族
                                 (!is.na(DMDEDUC2)) & # 教育程度
                                 (!is.na(SMQ020)) # 是否吸烟至少100支
)
dim(paper_data) # [1] 2716   31

paper_exclude_data = subset.data.frame(output, RIDAGEYR >= 60 &(
  (is.na(CFDCSR)) | # CERAD 延迟回忆评分
    (is.na(CFDAST)) |# 语言流畅性评分Animal Fluency
    (is.na(CFDDS)) | # 数字符号替代测试(DSST)
    (is.na(DR1TLZ))| #饮食-叶黄素 & 玉米黄质摄取
    (is.na(DR1TKCAL))| #能力摄入总量-Energy (kcal)
    (is.na(RIDAGEYR)) | # 年龄
    (is.na(RIAGENDR)) | # 性别
    (is.na(RIDRETH3)) | # 种族
    (is.na(DMDEDUC2)) | # 教育程度
    (is.na(SMQ020))) # 是否吸烟至少100支
)
dim(paper_exclude_data) # [1] 916  31

index_DR_na = (is.na(output$DR1TLZ) & is.na(output$DR2TLZ)) # 饮食-叶黄素 & 玉米黄质摄取
index_60_DR_na = which(output$RIDAGEYR >=60 & index_DR_na)
length(index_60_DR_na) # [1] 564

index_CFDCSR_na = is.na(output$CFDCSR) # CERAD 延迟回忆评分
index_60_DR_na = which(output$RIDAGEYR >=60 & index_CFDCSR_na)
length(index_60_DR_na) # [1] 506

# 将paper关注的2700+60岁以上有完整研究数据的人员，和排除的人员汇总在一起，并且打上标签 exclude:排除；include：包含
# View(paper_exclude_data)
paper_exclude_data$Type = "exclude"
# View(paper_exclude_data)
paper_data$Type = "include"
# View(paper_data)
compare_data = dplyr::bind_rows(list(paper_data, paper_exclude_data))
# View(compare_data)
# dim(compare_data)

# tableby 函数使用的注意事项-1，分类变量可以在前面加上-factor；
# tableby 函数使用的注意事项-2，用于分队列的变量写在前面（Type变量）；DMDEDUC2：教育程度

# tab_compare_data = tableby(Type ~ factor(DMDEDUC2) , data = compare_data) # 复现论文中的排除人群和纳入人群在人口学变量上的差异
# summary(tab_compare_data)

# RIAGENDR:性别
paper_data$Sex = ifelse(paper_data$RIAGENDR == 1 , "male" , "female")
# table(paper_data$Sex)
tab_Sex = tableby(Type ~ factor(Sex), data = paper_data, weights = WTDRD1, subset = WTDRD1 > 0)
summary(tab_Sex, text = TRUE)

# RIDAGEYR:年龄
y = paper_data$RIDAGEYR
paper_data$age_group = ifelse(y >= 60 & y <= 69, "60-69 years", ifelse(y >= 70 & y <= 79, "70-79 years", "80+ years"))
# table(paper_data$age_group)
tab_age_group = tableby(Type ~ factor(age_group), data = paper_data, weights = WTDRD1, subset = WTDRD1 > 0)
summary(tab_age_group, text=TRUE)

# RIDRETH3:种族
table(paper_data$RIDRETH3)
e = paper_data$RIDRETH3
paper_data$race = ifelse(e == 1 , "Mexican American" , 
                         ifelse(e == 2 , "Other Hispanic" , 
                                ifelse(e == 3 , "Non-Hispanic White" , 
                                       ifelse(e == 4 , "Non-Hispanic Black" , 
                                              ifelse(e == 6 , "Other/multiracial" , "Other/multiracial")))))
# table(paper_data$race)
# 上面的方法与下面的方法是等效的，其中上面的是传统的写法，下面的使用的不是R的基本命令，而是第三方的函数，不如上面的稳定。
race <- recode_factor(paper_data$RIDRETH3,
                      "1" = "Mexican American" ,
                      "2" = "Other Hispanic" ,
                      "3" = "Non-Hispanic White" ,
                      "4" = "Non-Hispanic Black" ,
                      "6" = "Other/multiracial" ,
                      "7" = "Other/multiracial")
paper_data$race <- race
table(paper_data$race)
tab_race = tableby(Type ~ factor(race), data = paper_data, weights = WTDRD1, subset = WTDRD1 > 0)
summary(tab_race, text=TRUE)

# ALQ120U:1=Week; 2=Month; 3=Year; 7=Refused; 9=Don't know; .=Missing; ALQ101：在任何一年中，至少喝过12杯酒精饮料
# ddply(paper_data, .(ALQ101, ALQ110), summarize, num = length(SEQN))
# ddply(alq_g, .(ALQ101, ALQ110), summarize, num = length(SEQN))
# table(paper_data$ALQ120U)
# ALQ120Q:0 to 350=Range of Values; 777=Refused; 999=Don't know; .=Missing;

# 统一单位为月 week-month: *4; year-month:/12
u = paper_data$ALQ120U
paper_data$trans_quantity_month = ifelse(paper_data$ALQ120Q >= 0 , paper_data$ALQ120Q * (ifelse(u == 1, 4, ifelse(u == 3, 1 / 12, ifelse(u == 7 | u == 9, NA , 1 )))) , NA)
# 下面的方法先产生了一个辅助列存表，即周转月时乘4，年转月时除以12，其它按空值处理。而上面的方法是将辅助列的计算过程直接内嵌进去了，所以用上面的即可。
# paper_data$trans_unit_month = ifelse(paper_data$ALQ120U == 1, 1 * 4,
#                               ifelse(paper_data$ALQ120U == 3, 1 / 12,
#                               ifelse(paper_data$ALQ120U == 7 | paper_data$ALQ120U == 9, NA , 1 )))
# paper_data$trans_quantity_month = ifelse(paper_data$ALQ120Q >= 0 , paper_data$ALQ120Q * paper_data$trans_unit_month , NA)
# View(paper_data)

# View(paper_data[ , c("SEQN","ALQ101","ALQ110", "ALQ120Q", "ALQ120U", "trans_quantity_month", "trans_unit_month")])
# 将饮酒量的结果划分为4档：Non-drinker, 1-5 drinks/month, 5-10 drinks/month, 10+ drinks/month
m = paper_data$trans_quantity_month
paper_data$trans_quantity_month_factor = ifelse(m >= 1 & m < 5, "1-5 drinks/month",
                                                ifelse(m >= 5 & m < 10, "5-10 drinks/month",
                                                       ifelse(m >= 10, "10+ drinks/month", "wait")))
# ddply(paper_data ,  .(ALQ101, trans_quantity_month_factor) , summarise, num = length(SEQN))
# View(paper_data[ , c("SEQN","ALQ101","ALQ110", "ALQ120Q", "ALQ120U", "trans_quantity_month", "trans_unit_month", "trans_quantity_month_factor")])

# ALQ101回答是1（说明，在任何一年中，至少喝过12杯酒精饮料），但是 trans.quantity.month.factor 没有具体的值，则将 trans.quantity.month.factor 取值为"1-5 drinks/month"
table(paper_data$trans_quantity_month_factor)
f = paper_data$trans_quantity_month_factor
paper_data$trans_quantity_month_factor[which((f == "wait" | is.na(f)) & paper_data$ALQ101 == 1)] = "1-5 drinks/month"
# table(paper_data$trans_quantity_month_factor)
# ddply(paper_data, .(ALQ101, paper_data$trans_quantity_month_factor), summarise, n = length(SEQN))
# ALQ101回答是2（说明，在任何一年中，没有喝过12杯酒精饮料，即按没有饮酒计算），那么全部等于Non-drinker（不饮酒的人），而不用管他在问卷中实际是如何回答的。
paper_data$trans_quantity_month_factor[which(paper_data$ALQ101 == 2)] = "Non-drinker"
# table(paper_data$trans_quantity_month_factor)
# View(paper_data)
paper_data$alq_group = paper_data$trans_quantity_month_factor # 最终的变量名称为alq_group，而之前的三个都是辅助变量（trans_quantity_month_factor、trans_quantity_month、trans_unit_month）
# View(paper_data)
# table(paper_data$alq_group)
# tab_alq_group = tableby(Type ~ factor(alq_group) + factor(RIAGENDR), data = paper_data, weights = WTDRD1, subset = WTDRD1 > 0)
# summary(tab_alq_group , text=TRUE)
# View(paper_data[ , c("SEQN" , "ALQ101" , "ALQ110" , "ALQ120Q" , "ALQ120U" , "trans_unit_month" , "trans_quantity_month" , "alq_group")])
# table(paper_data$DR1TKCAL)

# 卡路里-total_calories day1（DR1TKCAL）、day2（DR2TKCAL） 都有，则取2天平均，否则，取第1天的值，先验证一下第一天是否含有NA，结果没有，所以下面的代码没有验证第一天为NA的过程，直接使用了第一天的值。
# which(is.na(paper_data$DR1TKCAL)) # integer(0)
# which(is.na(paper_data$DR2TKCAL)) # 这里显示有很多，说明含有NA的值，第一天为0，说明不含有NA的值
paper_data$total_calories = ifelse(is.na(paper_data$DR2TKCAL) , paper_data$DR1TKCAL , (paper_data$DR1TKCAL + paper_data$DR2TKCAL) / 2)
# View(paper_data[ , c("DR1TKCAL" , "DR2TKCAL" , "total_calories")])

# 饮食 L&Z-total_dr_LZ day1（DR1TLZ）、day2（DR2TLZ） 都有，则取2天平均，否则，取第一天的值
# which(is.na(paper_data$DR1TLZ))
# which(is.na(paper_data$DR2TLZ))
paper_data$total_dr_LZ = ifelse(is.na(paper_data$DR2TLZ) , paper_data$DR1TLZ , (paper_data$DR1TLZ + paper_data$DR2TLZ) / 2)
# View(paper_data[ , c("DR1TLZ" , "DR2TLZ" , "total_dr_LZ")])

# 认知评分；CFDCST1、CFDCST2、CFDCST3，分别是第1、2、3次即刻记忆测验，现在求三次的累加。
# colnames(paper_data)
paper_data$CERAD_total = paper_data$CFDCST1 + paper_data$CFDCST2 + paper_data$CFDCST3
# View(paper_data[ , c("CERAD_total" , "CFDCST1" , "CFDCST2" , "CFDCST3")])

# 离散变量
nhanes_design = svydesign(data = paper_data[which(paper_data$WTDRD1 > 0) , ], id = ~ SDMVPSU, strata = ~ SDMVSTRA , weights = ~ WTDRD1 , nest = TRUE)
# summary(nhanes_design)
svytotal( ~ age_group + Sex + race + alq_group , nhanes_design , na.rm = TRUE)

# 连续性变量
svymean( ~ INDFMPIR + RIDAGEYR + BMXBMI + BMXWAIST + total_calories + total_dr_LZ + CFDCST1 + CFDCST2 + CFDCST3 + CERAD_total + CFDCSR + CFDAST + CFDDS , nhanes_design , na.rm = TRUE)
svyquantile( ~ INDFMPIR + RIDAGEYR + BMXBMI + BMXWAIST + total_calories + total_dr_LZ + CFDCST1 + CFDCST2 + CFDCST3 + CERAD_total + 
               CFDCSR + CFDAST + CFDDS , nhanes_design , na.rm = TRUE , quantiles = c(.25 , .5 , .75) , interval.type="mean")

#### 3. 课后作业-01 ####
# part1: 衍生 BMI、Smoking status、教育程度的分类变量，命名为 BMI.group, smoke.group, education.attainment，其中 smoking status 可以参考多个定义方式
# part2: 1）膳食补充剂中摄入 L&Z 的平均值的代码，supplement L and Z intake，命名为 total.ds.LZ
#        2）计算总体 L & Z 的摄入量，命名为 total.dr.ds.LZ

# View(paper_data[ , c("BMXBMI" , "BMXWAIST")])
# 衍生 BMI（变量：BMXBMI），如果是NA，按Underweight (<18.5)处理（这样是否合适？）。https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BMX_G.htm#BMXBMI
b = paper_data$BMXBMI
paper_data$BMI_group = ifelse((is.na(b) | b < 18.5), "Underweight (<18.5)",
                              ifelse(b >= 18.5 & b < 25, "Normal (18.5 to <25)",
                                     ifelse(b >= 25 & b < 30, "Overweight (25 to <30)", 
                                            ifelse(b >= 30, "Obese (30 or greater)" , "Underweight (<18.5)"))))
# View(paper_data[ , c("BMXBMI" , "BMI_group")])
# table(paper_data$BMI_group)

# SMQ040 - 你现在抽烟吗  1：每天， 2：有些日子（回答1或2按当前吸烟者）； SMQ050Q - 戒烟多久了 1-193（有此值按以前的吸烟者） SMQ020 - 一生中至少吸过 100 支香烟 2：不是（按从不吸烟）
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/SMQ_G.htm
paper_data$smoke_group = ifelse((paper_data$SMQ040 == 1 | paper_data$SMQ040 == 2) , "Current smoker",
                                ifelse((paper_data$SMQ050Q >= 1 & paper_data$SMQ050Q <= 193) , "Former smoker" , "Never smoker"))
# View(paper_data[ , c("SMQ020" , "SMQ040" , "SMQ050Q" , "smoke_group")])
table(paper_data$smoke_group)
table_1 = tableby( ~   factor(smoke_group) , data = paper_data)
summary(table_1 , text = TRUE)

# 教育程度 https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.htm#DMDEDUC2
d = paper_data$DMDEDUC2
paper_data$education_attainment = ifelse(d == 1 , "Less than 9th grade" , 
                                         ifelse(d == 2 , "9–11th grade (12th grade with no diploma)" , 
                                                ifelse(d == 3 , "High school graduate/GED" , 
                                                       ifelse(d == 4 , "Some college or AA" , 
                                                              ifelse(d == 5 , "College graduate or above" , "NA")))))
# View(paper_data[ , c("DMDEDUC2" , "education_attainment")])
# table(paper_data$education_attainment)

# 膳食补充剂中摄入 L&Z 的平均值的代码 https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DS1TOT_G.htm
s1 =paper_data$DS1TLZ
s2 =paper_data$DS2TLZ
paper_data$total_ds_LZ = ifelse(is.na(s1) & !is.na(s2) , s2 , 
                                ifelse(!is.na(s1) & is.na(s2) , s1 , 
                                       ifelse(!is.na(s1) & !is.na(s2) , (s1 + s2) / 2 , "NA")))
# View(paper_data[ , c("DS1TLZ" , "DS2TLZ" , "total_ds_LZ")])

# 膳食中摄入 L&Z 的平均值的代码 https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DR1TOT_G.htm
r1 =paper_data$DR1TLZ
r2 =paper_data$DR2TLZ
paper_data$total_dr_LZ = ifelse(is.na(r1) & !is.na(r2) , r2 , 
                                ifelse(!is.na(r1) & is.na(r2) , r1 , 
                                       ifelse(!is.na(r1) & !is.na(r2) , (r1 + r2) / 2 , "NA")))
# View(paper_data[ , c("DR1TLZ" , "DR2TLZ" , "total_dr_LZ")])

# 计算总体 L & Z 的摄入量。为什么要使用as.numeric？不知道为什么会产生non-numeric argument to binary operator这种错误，给加上就可以了。
paper_data[ , c("total_ds_LZ" , "total_dr_LZ")] = as.numeric(unlist(paper_data[ , c("total_ds_LZ" , "total_dr_LZ")]))
ds =paper_data$total_ds_LZ
dr =paper_data$total_dr_LZ
paper_data$total_dr_ds_LZ = ifelse(is.na(ds) & !is.na(dr) , dr , 
                                   ifelse(!is.na(ds) & is.na(dr) , ds , 
                                          ifelse(!is.na(ds) & !is.na(dr) , (ds + dr) , "NA")))
# View(paper_data[ , c("total_ds_LZ" , "total_dr_LZ" , "total_dr_ds_LZ")])

# 论文中Supplement L and Z intake (mg/day; n=471)
average_ds = mean(data_age_60$DS1TLZ , na.rm = TRUE) / 1000 # 此为求平均值，并忽略NA（比如，值分别为1.2，2.5，NA，则平均为1.85）
# print(average_ds)

# 论文中Dietary L and Z intake (mg/day)
average_dr = mean(data_age_60$DR1TLZ , na.rm = TRUE) / 1000
# print(average_dr)

# 论文中